downloaded from CDC National Health and Examination Survey, NHANES Questionnaires, Datasets, and Related Documentation
# 2001-2002
demographics01 <- read.xport(here("data","DEMO_B_2001_2002.XPT"))
phthalates01 <- read.xport(here("data","PHPYPA_B_2001_2002.XPT"))
nhanes01 <- merge(phthalates01,demographics01,all.x=T)
# 2003-2004
demographics03 <- read.xport(here("data","DEMO_C_2003_2004.XPT"))
phthalates03 <- read.xport(here("data","L24PH_C_2003_2004.XPT"))
nhanes03 <- merge(phthalates03,demographics03,all.x=T)
# 2005-2006
demographics05 <- read.xport(here("data","DEMO_D_2005_2006.XPT"))
phthalates05 <- read.xport(here("data","PHTHTE_D_2005_2006.XPT"))
nhanes05 <- merge(phthalates05,demographics05,all.x=T)
# 2007-2008
demographics07 <- read.xport(here("data","DEMO_E_2007_2008.XPT"))
phthalates07 <- read.xport(here("data","PHTHTE_E_2007_2008.XPT"))
nhanes07 <- merge(phthalates07,demographics07,all.x=T)
# 2009-2010
demographics09 <- read.xport(here("data","DEMO_F_2009_2010.XPT"))
phthalates09 <- read.xport(here("data","PHTHTE_F_2009_2010.XPT"))
nhanes09 <- merge(phthalates09,demographics09,all.x=T)
# 2011-2012
demographics11 <- read.xport(here("data","DEMO_G_2011_2012.XPT"))
phthalates11 <- read.xport(here("data","PHTHTE_G_2011_2012.XPT"))
nhanes11 <- merge(phthalates11,demographics11,all.x=T)
# 2013-2014
demographics13 <- read.xport(here("data","DEMO_H_2013_2014.XPT"))
phthalates13 <- read.xport(here("data","PHTHTE_H_2013_2014.XPT"))
nhanes13 <- merge(phthalates13,demographics13,all.x=T)
# 2015-2016
demographics15 <- read.xport(here("data","DEMO_I_2015_2016.XPT"))
phthalates15 <- read.xport(here("data","PHTHTE_I_2015_2016.XPT"))
nhanes15 <- merge(phthalates15,demographics15,all.x=T)
## add a column with the year to each data file:
nhanes01$year <- 2001
nhanes03$year <- 2003
nhanes05$year <- 2005
nhanes07$year <- 2007
nhanes09$year <- 2009
nhanes11$year <- 2011
nhanes13$year <- 2013
nhanes15$year <- 2015
## choose the variables to select from each data file:
var<-c('year','DMDEDUC3','DMDEDUC2','DMDHREDU','DMDHSEDU','RIDAGEYR','RIAGENDR','RIDRETH1','INDFMPIR','DMDYRSUS','DMDCITZN','URXMEP')
nhanes01s<-subset(nhanes01,select=var)
nhanes03s<-subset(nhanes03,select=var)
nhanes05s<-subset(nhanes05,select=var)
nhanes07s<-subset(nhanes07,select=var)
nhanes09s<-subset(nhanes09,select=var)
nhanes11s<-subset(nhanes11,select=var)
nhanes13s<-subset(nhanes13,select=var)
nhanes15s<-subset(nhanes15,select=var)
## create one dataset for NHANES demographics and phthalates, year 2001-2016
nhanes <- rbind(nhanes01s,nhanes03s,nhanes05s,nhanes07s,nhanes09s,nhanes11s,nhanes13s,nhanes15s)
## create a data file so that I don't need to load, clean, and select the data again
# has all demographics and URXMEP phthalate for NHANES year 2001-2016
write.csv(nhanes, "nhanes_december22.csv")
After inspecting the phthalate data in its raw form and the log form, we determined it is necessary to log transform the data in order to visualize it.
hist(nhanes$URXMEP)
hist(log(nhanes$URXMEP))
nhanes$log_monoethyl = log(nhanes$URXMEP)
Original: - year = year the observation took place *this was added by me when I merged the years of data - DMDEDUC3 = the highest grade level of education completed by participants 6-19 y.o. - DMDEDUC2 = the highest grade/level of education completed by participants 20 years and older - DMDHREDU = the household reference person’s education level - DMDHSEDU = the household reference person’s spouse’s education level - RIDAGEYR = ages in years at screening - RIAGENDR = gender - RIDRETH1 = race/Hispanic origin - INDFMPIR = ratio of family income to poverty - DMDYRSUS = length of time in U.S. - DMDCITZN = citizenship status - URXMEP = mono-ethyl phthalate (ng/mL)
hist(nhanes$DMDHREDU)
# add "edu_ref" to "nhanes"
nhanes$edu_ref = factor(ifelse(nhanes$DMDHREDU %in% 1:4,"partial college and below",
ifelse(nhanes$DMDHREDU==5,"college and beyond",NA)),
levels=c("partial college and below","college and beyond"))
summary(nhanes$edu_ref)
hist(nhanes$DMDHSEDU)
# add "edu_spouse_ref" to "nhanes"
nhanes$edu_spouse_ref = factor(ifelse(nhanes$DMDHSEDU %in% 1:4,"partial college and below",
ifelse(nhanes$DMDHSEDU==5,"college and beyond",NA)),
levels=c("partial college and below","college and beyond"))
summary(nhanes$edu_spouse_ref)
hist(nhanes$DMDEDUC3)
# add "edu_child_participants" to "nhanes"
nhanes$edu_child_participants = factor(ifelse(nhanes$DMDEDUC3 %in% 0:8,"primary",
ifelse(nhanes$DMDEDUC3 %in% 9:15,"secondary",NA)),
levels=c("primary","secondary"))
summary(nhanes$edu_child_participants)
## not a good variable d/t 14,755 "refused," "don't know," "missing,
hist(nhanes$DMDEDUC2)
# add "edu_adult_participants" to "nhanes"
nhanes$edu_adult_participants = factor(ifelse(nhanes$DMDEDUC2 ==1, "less than 9th grade",
ifelse(nhanes$DMDEDUC2 ==2, "9-11th grade",
ifelse(nhanes$DMDEDUC2 ==3, "high school grad/GED",
ifelse(nhanes$DMDEDUC2 ==4, "some college or AA",
ifelse(nhanes$DMDEDUC2 ==5, "college grad or above", NA))))))
summary(nhanes$edu_adult_participants)
Based on the summary statistics of the variable “RIDAGEYR”: the youngest participant is 3 years old, the oldest participant is 85 years old, and the median age is 31 years old.
hist(nhanes$RIDAGEYR)
summary(nhanes$RIDAGEYR)
# add "age" to "nhanes"
nhanes$age = factor(ifelse(nhanes$RIDAGEYR >=65,"older adult",
ifelse(nhanes$RIDAGEYR >25 & nhanes$RIDAGEYR<64,"middle-aged",
ifelse(nhanes$RIDAGEYR >=19 & nhanes$RIDAGEYR<=25,"young adult",
ifelse(nhanes$RIDAGEYR <=18,"child",NA)))),
levels=c("child","young adult", "middle-aged", "older adult"))
summary(nhanes$age)
hist(nhanes$RIAGENDR)
# add "gender" to "nhanes"
nhanes$gender = factor(ifelse(nhanes$RIAGENDR ==1, "male",
ifelse(nhanes$RIAGENDR ==2, "female", NA)),
levels=c("male", "female"))
summary(nhanes$gender)
hist(nhanes$RIDRETH1)
# add "race/ethnicity" to "nhanes"
nhanes$ethnicity = factor(ifelse(nhanes$RIDRETH1 ==1, "Mexican American",
ifelse(nhanes$RIDRETH1 ==2, "Other Hispanic",
ifelse(nhanes$RIDRETH1 ==3, "Non-Hispanic White",
ifelse(nhanes$RIDRETH1 ==4, "Non-Hispanic Black",
ifelse(nhanes$RIDRETH1 ==5, "Other or Multi", NA))))),
levels=c("Non-Hispanic White", "Non-Hispanic Black", "Mexican American", "Other Hispanic", "Other or Multi"))
summary(nhanes$ethnicity)
hist(nhanes$DMDCITZN)
# add "citizenship status" to "nhanes"
nhanes$citizenship = factor(ifelse(nhanes$DMDCITZN ==1, "birth or naturalization",
ifelse(nhanes$DMDCITZN ==2, "not U,S, citizen", NA)),
levels=c("birth or naturalization", "not U,S, citizen"))
summary(nhanes$citizenship)
hist(nhanes$DMDYRSUS)
# add length of time in the U.S. to "nhanes"
nhanes$years_in_US = factor(ifelse(nhanes$DMDYRSUS ==1, "less than 1 year",
ifelse(nhanes$DMDYRSUS ==2, "1 year or more, but less than 5 years",
ifelse(nhanes$DMDYRSUS ==3, "5 years or more, but less than 10 years",
ifelse(nhanes$DMDYRSUS ==4, "10 years or more, but less than 15 years",
ifelse(nhanes$DMDYRSUS ==5, "15 years or more, but less than 20 years",
ifelse(nhanes$DMDYRSUS ==6, "20 years or more, but less than 30 years",
ifelse(nhanes$DMDYRSUS ==7, "30 years or more, but less than 40 years",
ifelse(nhanes$DMDYRSUS ==8, "40 years or more, but less than 50 years",
ifelse(nhanes$DMDYRSUS ==9, "50 years or more", NA))))))))))
summary(nhanes$years_in_US)
## not a very good variable, d/t 18,035 "refused," "don't know", or "missing"
hist(nhanes$INDFMPIR)
# add ratio of family income to poverty level
nhanes$income_to_poverty_ratio = factor(ifelse(nhanes$INDFMPIR < 1,"at poverty threshold",
ifelse(nhanes$INDFMPIR >=1 & nhanes$INDFMPIR <2, "family income 2x poverty threshold",
ifelse(nhanes$INDFMPIR >=2 & nhanes$INDFMPIR <3, "family income 3x poverty threshold",
ifelse(nhanes$INDFMPIR >=3 & nhanes$INDFMPIR <4, "family income 4x poverty threshold",
ifelse(nhanes$INDFMPIR >=4 & nhanes$INDFMPIR <5, "family income 5x poverty threshold",
ifelse(nhanes$INDFMPIR ==5, "family income more than 5x poverty threshold", NA)))))),
levels=c("at poverty threshold", "family income 2x poverty threshold", "family income 3x poverty threshold", "family income 4x poverty threshold", "family income 5x poverty threshold", "family income more than 5x poverty threshold"))
summary(nhanes$income_to_poverty_ratio)
# Household reference person education level and Mono-ethyl phthalate
box_edu_ref <- ggplot(data = nhanes, aes(x=log_monoethyl, y=edu_ref, fill=edu_ref)) +
scale_fill_brewer(palette="PuBuGn") +
geom_boxplot() +
theme(text = element_text(size=12)) +
xlab("(logged) Mono-Ethyl Phthalate Level (ng/mL)") +
ylab("Household Reference Person's Education Level") +
ggtitle("Household Ref. Person's Education Level and Logged Phthalate Level")
box_edu_ref
# Family Income and Mono-ethyl phthalate
box_poverty <- ggplot(data = nhanes, aes(x=log_monoethyl, y=income_to_poverty_ratio, fill=income_to_poverty_ratio)) +
scale_fill_brewer(palette="PuBuGn") +
geom_boxplot() +
theme(text = element_text(size=12)) +
xlab("(logged) Mono-Ethyl Phthalate Level (ng/mL)") +
ylab("Ratio of Participant's Family Income to Poverty") +
ggtitle("Poverty and Logged Phthalate Level")
box_poverty
# Age and Mono-ethyl phthalate
box_age <- ggplot(data = nhanes, aes(x=log_monoethyl, y=age, fill=age)) +
scale_fill_brewer(palette="PuBuGn") +
geom_boxplot() +
theme(text = element_text(size=12)) +
xlab("(logged) Mono-Ethyl Phthalate Level (ng/mL)") +
ylab("Age of Participant") +
ggtitle("Age and Logged Phthalate Level")
box_age
# multivariate regression analysis of socio-demographic variables and phthalates
modela <- lm(log_monoethyl ~ edu_ref + age + gender + ethnicity + income_to_poverty_ratio +
citizenship, data = nhanes)
summary(modela)
# take out gender
modelb <- lm(log_monoethyl ~ edu_ref + age + ethnicity + income_to_poverty_ratio +
citizenship, data = nhanes)
summary(modelb)
# take out gender, citizenship
modelc <- lm(log_monoethyl ~ edu_ref + age + ethnicity + income_to_poverty_ratio, data = nhanes)
summary(modelc)
The Bayesian Information Criterion (BIC),
BIC_list <- c(BIC(modela), BIC(modelb), BIC(modelc))
model_output <- rbind(data.frame(glance(modela)), data.frame(glance(modelb)), data.frame(glance(modelc))) %>% select(BIC)
model_output <- mutate(model_output, delta.BIC = BIC-min(BIC_list))
model_output$model <- c("Model A", "Model B", "Model C")
model_output <- model_output[,c("model", "BIC", "delta.BIC")]
kable(model_output, format = "markdown", digits = 3, caption = "BIC, and Delta.BIC for the models. Delta BIC > 7 indicates models that should be dismissed from further consideration.")
| model | BIC | delta.BIC |
|---|---|---|
| Model A | 70155.08 | 6.590 |
| Model B | 70148.49 | 0.000 |
| Model C | 70220.62 | 72.136 |
data(nhanes) # Telling R we want to use this data
modela <- lm(log_monoethyl ~ edu_ref + age + gender + ethnicity + income_to_poverty_ratio +
citizenship, data = nhanes)
summ(modela)
summ(modela, robust = "HC1") #robust standard errors
summ(modela, confint = TRUE, digits = 3) #In many cases, you’ll learn more by looking at confidence intervals than p-values. You can request them from summ. default is 95% CIs
summ(modela, confint = TRUE, pvals = FALSE) #DROP the p values all together
# THE GRAPH
plot_summs(modela)
plot_summs(modela, robust = TRUE)
plot_summs(modela, inner_ci_level = .9)
# plot coefficient uncertainty as normal distributions
plot_summs(modela, plot.distributions = TRUE, inner_ci_level = .9)
# table output for Word and RMarkdown documents
## error is in the parenthesis
export_summs(modela, scale = TRUE)
# confidence intervals instead of standard errors
export_summs(modela, scale = TRUE,
error_format = "[{conf.low}, {conf.high}]")
forest1 <- plot_summs(
point.size = 3,
fontsize=8,
colors = "darkseagreen4",
modela, coefs = c("Household Education College and Beyond
Partial College and Below (ref)" = "edu_refcollege and beyond",
"Age: Young Adult
Child (ref)" = "ageyoung adult",
"Age: Middle-aged
Age:Child (ref)" = "agemiddle-aged",
"Age: Older Adult
Age: Child (ref)" = "ageolder adult",
"Gender: Female
Gender: Male (ref)" = "genderfemale",
"Ethnicity: Non-Hispanic Black
Non-Hispanic White (ref)" = "ethnicityNon-Hispanic Black",
"Ethnicity: Mexican American
Non-Hispanic White (ref)" = "ethnicityMexican American",
"Ethnicity: Other Hispanic
Non-Hispanic White (ref)" = "ethnicityOther Hispanic",
"Ethnicity: Other or Multi
Non-Hispanic White (ref)" = "ethnicityOther or Multi",
"Family Income to Poverty Ratio: 2x Poverty threshold
At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 2x poverty threshold",
"Family Income to Poverty Ratio: 3x Poverty threshold
At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 3x poverty threshold",
"Family Income to Poverty Ratio: 4x Poverty threshold
At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 4x poverty threshold",
"Family Income to Poverty Ratio: 5x Poverty threshold
At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 5x poverty threshold",
"Family Income to Poverty Ratio: more than 5x Poverty threshold
At poverty threshold (ref)" = "income_to_poverty_ratiofamily income more than 5x poverty threshold",
"Citizenship Status: Not U.S. Citizen
U.S. Citizen by birth or naturalization (ref)" = "citizenshipnot U,S, citizen"),
scale = TRUE, robust = TRUE)
forest1
data(nhanes) # Telling R we want to use this data
modelb <- lm(log_monoethyl ~ edu_ref + age + ethnicity + income_to_poverty_ratio +
citizenship, data = nhanes)
summ(modelb)
summ(modelb, robust = "HC1") #robust standard errors
summ(modelb, confint = TRUE, digits = 3) #In many cases, you’ll learn more by looking at confidence intervals than p-values. You can request them from summ. default is 95% CIs
summ(modelb, confint = TRUE, pvals = FALSE) #DROP the p values all together
# THE GRAPH
plot_summs(modelb)
plot_summs(modelb, robust = TRUE)
plot_summs(modelb, inner_ci_level = .9)
# plot coefficient uncertainty as normal distributions
plot_summs(modelb, plot.distributions = TRUE, inner_ci_level = .9)
# table output for Word and RMarkdown documents
## error is in the parenthesis
export_summs(modelb, scale = TRUE)
# confidence intervals instead of standard errors
export_summs(modelb, scale = TRUE,
error_format = "[{conf.low}, {conf.high}]")
forest2 <- plot_summs(
point.size = 3,
fontsize=8,
colors = "darkslategray",
modelb, coefs = c("Household Education College and Beyond
Partial College and Below (ref)" = "edu_refcollege and beyond",
"Age: Young Adult
Child (ref)" = "ageyoung adult",
"Age: Middle-aged
Age:Child (ref)" = "agemiddle-aged",
"Age: Older Adult
Age: Child (ref)" = "ageolder adult",
"Ethnicity: Non-Hispanic Black
Non-Hispanic White (ref)" = "ethnicityNon-Hispanic Black",
"Ethnicity: Mexican American
Non-Hispanic White (ref)" = "ethnicityMexican American",
"Ethnicity: Other Hispanic
Non-Hispanic White (ref)" = "ethnicityOther Hispanic",
"Ethnicity: Other or Multi
Non-Hispanic White (ref)" = "ethnicityOther or Multi",
"Family Income to Poverty Ratio: 2x Poverty threshold
At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 2x poverty threshold",
"Family Income to Poverty Ratio: 3x Poverty threshold
At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 3x poverty threshold",
"Family Income to Poverty Ratio: 4x Poverty threshold
At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 4x poverty threshold",
"Family Income to Poverty Ratio: 5x Poverty threshold
At poverty threshold (ref)" = "income_to_poverty_ratiofamily income 5x poverty threshold",
"Family Income to Poverty Ratio: more than 5x Poverty threshold
At poverty threshold (ref)" = "income_to_poverty_ratiofamily income more than 5x poverty threshold",
"Citizenship Status: Not U.S. Citizen
U.S. Citizen by birth or naturalization (ref)" = "citizenshipnot U,S, citizen"),
scale = TRUE, robust = TRUE)
forest2
vis_miss(nhanes, sort_miss = TRUE)